Topics To Cover

1)Read a CSV Data file with index

Import all necessary libraries

In [1]:
import pandas as pd

Read a csv file: assign variable "Data_csv"

In [2]:
#Read a csv file: assign variable "Data_csv"

Data_csv = pd.read_csv("E:/DataAnalyticsCourse/GittHub/Datasets/Loan_prediction/train.csv")
#Data_csv = pd.read_csv("url")
Data_csv.head()
Out[2]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y
3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y
4 LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y
In [5]:
#Suppose if you dont have(want) columns name
Data_csv1 = pd.read_csv("E:/DataAnalyticsCourse/GittHub/Datasets/Loan_prediction/train.csv",header=None)
Data_csv1.head()
Out[5]:
0 1 2 3 4 5 6 7 8 9 10 11 12
0 Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
1 LP001002 Male No 0 Graduate No 5849 0 NaN 360 1 Urban Y
2 LP001003 Male Yes 1 Graduate No 4583 1508 128 360 1 Rural N
3 LP001005 Male Yes 0 Graduate Yes 3000 0 66 360 1 Urban Y
4 LP001006 Male Yes 0 Not Graduate No 2583 2358 120 360 1 Urban Y
In [10]:
#Suppose if you want to assign new column name as per your wish:
#Assign all col name in a variable
col_name = ["Loan_ID","Gender","Married","Dependents","Education","Self_Employed","ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term","Credit_History","Property_Area","Loan_Status"]
Data_csv2 = pd.read_csv("E:/DataAnalyticsCourse/GittHub/Datasets/Loan_prediction/train.csv",names =col_name)
Data_csv2.head()
Out[10]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
1 LP001002 Male No 0 Graduate No 5849 0 NaN 360 1 Urban Y
2 LP001003 Male Yes 1 Graduate No 4583 1508 128 360 1 Rural N
3 LP001005 Male Yes 0 Graduate Yes 3000 0 66 360 1 Urban Y
4 LP001006 Male Yes 0 Not Graduate No 2583 2358 120 360 1 Urban Y
In [11]:
#now you can delete the first row of the table
Data_csv2 = Data_csv2.drop(Data_csv2.index[0])
Data_csv2.head()
Out[11]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
1 LP001002 Male No 0 Graduate No 5849 0 NaN 360 1 Urban Y
2 LP001003 Male Yes 1 Graduate No 4583 1508 128 360 1 Rural N
3 LP001005 Male Yes 0 Graduate Yes 3000 0 66 360 1 Urban Y
4 LP001006 Male Yes 0 Not Graduate No 2583 2358 120 360 1 Urban Y
5 LP001008 Male No 0 Graduate No 6000 0 141 360 1 Urban Y
In [12]:
#Use first column as Index
Data_csv3 = pd.read_csv("E:/DataAnalyticsCourse/GittHub/Datasets/Loan_prediction/train.csv",index_col="Loan_ID")
Data_csv3.head()
Out[12]:
Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
Loan_ID
LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y
LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N
LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y
LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y
LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y

Read a Text File

In [17]:
columns =['State','Account_Len','Area','Ph_No.','Int_Plan','Vmail_Plan','messgs',
            'tot_day_mins','tot_day_calls','tot_day_chrgs','tot_evening_mins',
            'tot_evening_calls','tot_evening_chrgs','tot_ngt_mins','tot_ngt_calls',
            'tot_ngt_chrgs','tot_int_mins','tot_int_calls','tot_int_chrgs',
'cust_calls_made','churn_status']

Data_txt = pd.read_csv("E:/DataAnalyticsCourse/GittHub/Datasets/Telecom_churn/telecom_churn_data.txt",names=columns)
Data_txt.head()
Out[17]:
State Account_Len Area Ph_No. Int_Plan Vmail_Plan messgs tot_day_mins tot_day_calls tot_day_chrgs ... tot_evening_calls tot_evening_chrgs tot_ngt_mins tot_ngt_calls tot_ngt_chrgs tot_int_mins tot_int_calls tot_int_chrgs cust_calls_made churn_status
0 KS 128 415 382-4657 no yes 25 265.1 110 45.07 ... 99 16.78 244.7 91 11.01 10.0 3 2.70 1 False.
1 OH 107 415 371-7191 no yes 26 161.6 123 27.47 ... 103 16.62 254.4 103 11.45 13.7 3 3.70 1 False.
2 NJ 137 415 358-1921 no no 0 243.4 114 41.38 ... 110 10.30 162.6 104 7.32 12.2 5 3.29 0 False.
3 OH 84 408 375-9999 yes no 0 299.4 71 50.90 ... 88 5.26 196.9 89 8.86 6.6 7 1.78 2 False.
4 OK 75 415 330-6626 yes no 0 166.7 113 28.34 ... 122 12.61 186.9 121 8.41 10.1 3 2.73 3 False.

5 rows × 21 columns

In [19]:
Data_txt.describe()
Out[19]:
Account_Len Area messgs tot_day_mins tot_day_calls tot_day_chrgs tot_evening_mins tot_evening_calls tot_evening_chrgs tot_ngt_mins tot_ngt_calls tot_ngt_chrgs tot_int_mins tot_int_calls tot_int_chrgs cust_calls_made
count 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000 4617.000000
mean 100.645224 437.046350 7.849903 180.447152 100.054364 30.676576 200.429088 100.179770 17.036703 200.623933 99.944120 9.028185 10.279294 4.433831 2.775926 1.567035
std 39.597194 42.288212 13.592333 53.983540 19.883027 9.177145 50.557001 19.821314 4.297332 50.543616 19.935053 2.274488 2.757361 2.457615 0.744413 1.307019
min 1.000000 408.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 23.200000 12.000000 1.040000 0.000000 0.000000 0.000000 0.000000
25% 74.000000 408.000000 0.000000 143.700000 87.000000 24.430000 165.900000 87.000000 14.100000 167.100000 87.000000 7.520000 8.600000 3.000000 2.320000 1.000000
50% 100.000000 415.000000 0.000000 180.000000 100.000000 30.600000 200.800000 101.000000 17.070000 200.800000 100.000000 9.040000 10.300000 4.000000 2.780000 1.000000
75% 127.000000 510.000000 17.000000 216.800000 113.000000 36.860000 234.000000 114.000000 19.890000 234.900000 113.000000 10.570000 12.100000 6.000000 3.270000 2.000000
max 243.000000 510.000000 51.000000 351.500000 165.000000 59.760000 363.700000 170.000000 30.910000 395.000000 175.000000 17.770000 20.000000 20.000000 5.400000 9.000000
In [20]:
Data_txt.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4617 entries, 0 to 4616
Data columns (total 21 columns):
State                4617 non-null object
Account_Len          4617 non-null int64
Area                 4617 non-null int64
Ph_No.               4617 non-null object
Int_Plan             4617 non-null object
Vmail_Plan           4617 non-null object
messgs               4617 non-null int64
tot_day_mins         4617 non-null float64
tot_day_calls        4617 non-null int64
tot_day_chrgs        4617 non-null float64
tot_evening_mins     4617 non-null float64
tot_evening_calls    4617 non-null int64
tot_evening_chrgs    4617 non-null float64
tot_ngt_mins         4617 non-null float64
tot_ngt_calls        4617 non-null int64
tot_ngt_chrgs        4617 non-null float64
tot_int_mins         4617 non-null float64
tot_int_calls        4617 non-null int64
tot_int_chrgs        4617 non-null float64
cust_calls_made      4617 non-null int64
churn_status         4617 non-null object
dtypes: float64(8), int64(8), object(5)
memory usage: 667.3+ KB
In [ ]:
#Observations:Int_Plan and Vmail_Plan
In [28]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams["figure.figsize"]=(10,5)

Data Visualization

In [29]:
sns.pairplot(Data_txt,hue="churn_status")
Out[29]:
<seaborn.axisgrid.PairGrid at 0xdbe7430>
In [42]:
sns.distplot(Data_txt.tot_day_chrgs)
plt.show()
sns.distplot(Data_txt.tot_evening_chrgs)
plt.show()
sns.distplot(Data_txt.tot_ngt_chrgs)
plt.show()
sns.distplot(Data_txt.tot_int_chrgs)
plt.show()
C:\Users\niran\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
In [45]:
#checking outliers
sns.boxplot(x="State",y="tot_int_chrgs",hue ="churn_status",data=Data_txt)
Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b60be50>

Data Cleaning

In [ ]:
# Assuming Area and Phone no. is irrelavent as of now, so we can drop it
#Int_Plan,Vmail_Plan and churn_status is objective, we can change it to boolian
In [55]:
Data_txt["Int_Plan"] = np.where(Data_txt.Int_Plan =="yes",1,0)
Data_txt["Vmail_Plan"] = np.where(Data_txt.Vmail_Plan =="yes",1,0)
Data_txt["churn_status"] = np.where(Data_txt.churn_status =="True",1,0)
In [73]:
Data_txt.head()
Out[73]:
State Account_Len Area Int_Plan Vmail_Plan messgs tot_day_mins tot_day_calls tot_day_chrgs tot_evening_mins tot_evening_calls tot_evening_chrgs tot_ngt_mins tot_ngt_calls tot_ngt_chrgs tot_int_mins tot_int_calls tot_int_chrgs cust_calls_made churn_status
0 KS 128 415 0 0 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1 0
1 OH 107 415 0 0 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1 0
2 NJ 137 415 0 0 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0 0
3 OH 84 408 0 0 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2 0
4 OK 75 415 0 0 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3 0
In [74]:
#Data_txt = Data_txt.drop(["Ph_No."], axis=1)
Data_txt = Data_txt.drop(["State"], axis=1)
In [105]:
Data_txt.tail()
Out[105]:
Account_Len Area Int_Plan Vmail_Plan messgs tot_day_mins tot_day_calls tot_day_chrgs tot_evening_mins tot_evening_calls tot_evening_chrgs tot_ngt_mins tot_ngt_calls tot_ngt_chrgs tot_int_mins tot_int_calls tot_int_chrgs cust_calls_made churn_status
4612 57 510 0 0 25 144.0 81 24.48 187.2 112 15.91 158.6 122 7.14 8.5 6 2.30 3 0
4613 177 408 0 0 29 189.0 91 32.13 303.1 96 25.76 163.6 116 7.36 15.7 1 4.24 3 0
4614 67 408 0 0 33 127.5 126 21.68 296.1 129 25.17 200.9 91 9.04 13.0 3 3.51 1 0
4615 98 415 0 0 23 168.9 98 28.71 226.3 117 19.24 165.5 96 7.45 14.3 3 3.86 0 0
4616 140 415 0 0 0 204.7 100 34.80 126.8 107 10.78 202.8 115 9.13 12.1 4 3.27 2 0
In [118]:
from sklearn.model_selection import train_test_split
X=Data_txt.drop(["churn_status"], axis=1)
y=Data_txt.churn_status
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.8)
In [117]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(2308, 18)
(2309, 18)
(2308,)
(2309,)

Logistic Regression

# An erreo i am geeting because in the data set one or more label value is only one class, for analysis we need atleast two class from sklearn.linear_model import LogisticRegression logreg = LogisticRegression() logreg.fit(X_train,y_train) logreg_pred = logreg.predict(X_test)
In [126]:
# here the lable value is onlt one class ie Zero
y_train.head()
Out[126]:
1169    0
3128    0
3040    0
544     0
4443    0
Name: churn_status, dtype: int32
In [128]:
# we have to we have to identify the unique values in that label and clean the data
Data_txt["Int_Plan"].unique()
Out[128]:
array([0], dtype=int64)
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams["figure.figsize"]=(10,5)

columns =['State','Account_Len','Area','Ph_No.','Int_Plan','Vmail_Plan','messgs',
            'tot_day_mins','tot_day_calls','tot_day_chrgs','tot_evening_mins',
            'tot_evening_calls','tot_evening_chrgs','tot_ngt_mins','tot_ngt_calls',
            'tot_ngt_chrgs','tot_int_mins','tot_int_calls','tot_int_chrgs',
'cust_calls_made','churn_status']

Data_txt2 = pd.read_csv("E:/DataAnalyticsCourse/GittHub/Datasets/Telecom_churn/telecom_churn_data.txt",names=columns)
In [162]:
Data_txt2.Int_Plan.unique()
Out[162]:
array([' no', ' yes'], dtype=object)
In [171]:
#Here we found there is a space prifix in the value
# lets same try for the Vmail_Plan
print("Vmail_Plan:",Data_txt2.Vmail_Plan.unique())
print("churn_status: ",Data_txt2.churn_status.unique())
# we have found the same issue
Vmail_Plan: [' yes' ' no']
churn_status:  [' False.' ' True.']
In [2]:
# short out the issue

Data_txt2["Int_Plan"] = np.where(Data_txt2.Int_Plan ==" yes",1,0)
Data_txt2["Vmail_Plan"] = np.where(Data_txt2.Vmail_Plan ==" yes",1,0)
Data_txt2["churn_status"] = np.where(Data_txt2.churn_status ==" True.",1,0)
In [3]:
# Now check the unique values
print(Data_txt2.Int_Plan.unique())
print(Data_txt2.Vmail_Plan.unique())
print(Data_txt2.churn_status.unique())
[0 1]
[1 0]
[0 1]
In [4]:
Data_txt2.head()
Out[4]:
State Account_Len Area Ph_No. Int_Plan Vmail_Plan messgs tot_day_mins tot_day_calls tot_day_chrgs ... tot_evening_calls tot_evening_chrgs tot_ngt_mins tot_ngt_calls tot_ngt_chrgs tot_int_mins tot_int_calls tot_int_chrgs cust_calls_made churn_status
0 KS 128 415 382-4657 0 1 25 265.1 110 45.07 ... 99 16.78 244.7 91 11.01 10.0 3 2.70 1 0
1 OH 107 415 371-7191 0 1 26 161.6 123 27.47 ... 103 16.62 254.4 103 11.45 13.7 3 3.70 1 0
2 NJ 137 415 358-1921 0 0 0 243.4 114 41.38 ... 110 10.30 162.6 104 7.32 12.2 5 3.29 0 0
3 OH 84 408 375-9999 1 0 0 299.4 71 50.90 ... 88 5.26 196.9 89 8.86 6.6 7 1.78 2 0
4 OK 75 415 330-6626 1 0 0 166.7 113 28.34 ... 122 12.61 186.9 121 8.41 10.1 3 2.73 3 0

5 rows × 21 columns

In [9]:
plt.rcParams['figure.figsize']=(15,6)
sns.countplot(x="State", data=Data_txt2)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x570a870>
In [12]:
sns.barplot(x="State",y="churn_status" ,data=Data_txt2)
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0xe206530>
In [13]:
feature_cols =['Account_Len','Area','Int_Plan','Vmail_Plan','messgs',
            'tot_day_mins','tot_day_calls','tot_day_chrgs','tot_evening_mins',
            'tot_evening_calls','tot_evening_chrgs','tot_ngt_mins','tot_ngt_calls',
            'tot_ngt_chrgs','tot_int_mins','tot_int_calls','tot_int_chrgs',
'cust_calls_made']

X = Data_txt2[feature_cols]
y = Data_txt2.churn_status
In [22]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest= train_test_split(X,y,test_size=0.2,random_state=45)
In [23]:
print(xtrain.shape)
print(xtest.shape)
print(ytrain.shape)
print(ytest.shape)
(3693, 18)
(924, 18)
(3693,)
(924,)

# Perform Logistic Regression

In [26]:
# Perform Logistic Regression

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(xtrain,ytrain)
logreg_pred = logreg.predict(xtest)
In [28]:
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix
In [30]:
# check the model Score and accuracy

Model_log = round(logreg.score(xtrain,ytrain)*100,2)
print("Model Score : " ,Model_log)
Acc_log = accuracy_score(logreg_pred,ytest)
print("Acc_Score : ", Acc_log)
Model Score :  86.73
Acc_Score :  0.86038961039
In [33]:
#confusion Matrix
from sklearn import metrics
cnf_metrix = (metrics.confusion_matrix(ytest,logreg_pred))
cmap = sns.cubehelix_palette(50, hue=0.5, rot=0, light=0.9, dark=0, as_cmap=True)
sns.heatmap(cnf_metrix,cmap = cmap,xticklabels=['0','1'],yticklabels=['0','1'],annot=True, fmt="d",)
plt.xlabel('Predicted')
plt.ylabel('Actual')
Out[33]:
<matplotlib.text.Text at 0xeae8810>
In [34]:
# Cross Validation

from sklearn.cross_validation import cross_val_score
scores = cross_val_score(logreg, X, y, cv=5, scoring='accuracy') #fitting logistic regression to whole data with 5 fold
print("Log:",scores)
print("Log:" ,scores.mean())
C:\Users\niran\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Log: [ 0.87135135  0.85373781  0.87107259  0.86023835  0.86023835]
Log: 0.863327691722

Perform Decision Tree

In [44]:
from sklearn import tree
tree = tree.DecisionTreeClassifier(criterion="entropy",max_depth=3)

tree.fit(xtrain,ytrain)
tree_Pred = tree.predict(xtest).astype(int)



Model_tree = round(tree.score(xtrain,ytrain)*100,2)
print("Model Score : " ,Model_tree)
Acc_tree = accuracy_score(tree_Pred,ytest,normalize=True)
print("Acc_Score : ", Acc_tree)

#Confusion Metrix 

from sklearn import metrics
cnf_metrix = (metrics.confusion_matrix(ytest,tree_Pred))
cmap = sns.cubehelix_palette(50, hue=0.5, rot=0, light=0.9, dark=0, as_cmap=True)
sns.heatmap(cnf_metrix,cmap = cmap,xticklabels=['0','1'],yticklabels=['0','1'],annot=True, fmt="d",)
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Cross-Validation
scores = cross_val_score(tree, X, y, cv=5, scoring='accuracy') #fitting Decision Tree to whole data with 5 fold
print("Log:",scores)
print("Log:" ,scores.mean())
Model Score :  90.85
Acc_Score :  0.898268398268
Log: [ 0.9027027   0.90574215  0.89057421  0.91224269  0.8992416 ]
Log: 0.902100670551

Perform Random Forest Tree

In [47]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(max_depth = 3, min_samples_split=2, n_estimators = 200, random_state = 1)

forest.fit(xtrain,ytrain)
forest_Pred = forest.predict(xtest).astype(int)

Model_forest = round(forest.score(xtrain,ytrain)*100,2)
print("Model Score : " ,Model_forest)
Acc_forest = metrics.accuracy_score(forest_Pred,ytest,normalize=True)
print("Acc_Score : ", Acc_forest)

#Confusion Metrix 

from sklearn import metrics
cnf_metrix = (metrics.confusion_matrix(ytest,forest_Pred))
cmap = sns.cubehelix_palette(50, hue=0.5, rot=0, light=0.9, dark=0, as_cmap=True)
sns.heatmap(cnf_metrix,cmap = cmap,xticklabels=['0','1'],yticklabels=['0','1'],annot=True, fmt="d",)
plt.xlabel('Predicted')
plt.ylabel('Actual')


#Cross Validation
# Cross-Validation
scores = cross_val_score(forest, X, y, cv=5, scoring='accuracy') #fitting Decision Tree to whole data with 5 fold
print("Log:",scores)
print("Log:" ,scores.mean())
Model Score :  89.2
Acc_Score :  0.878787878788
Log: [ 0.88648649  0.87757313  0.88624052  0.88190683  0.88624052]
Log: 0.883689496647

PCA

In [105]:
from sklearn.decomposition import PCA 
In [ ]: